{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. 获取数据\n",
    "# 2. 数据处理\n",
    "# 2.1 缺失值处理\n",
    "# 2.2 确定目标值和特征值\n",
    "# 2.3 数据划分\n",
    "# 3. 特征工程-特征提取\n",
    "# 4. 机器学习-决策树算法\n",
    "# 5. 模型评估"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "from sklearn.feature_extraction import DictVectorizer\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. 获取数据\n",
    "data = pd.read_csv(\"http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_csv('../../../../data/泰坦尼克.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>row.names</th>\n",
       "      <th>pclass</th>\n",
       "      <th>survived</th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "      <th>embarked</th>\n",
       "      <th>home.dest</th>\n",
       "      <th>room</th>\n",
       "      <th>ticket</th>\n",
       "      <th>boat</th>\n",
       "      <th>sex</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1st</td>\n",
       "      <td>1</td>\n",
       "      <td>Allen, Miss Elisabeth Walton</td>\n",
       "      <td>29.0000</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>St Louis, MO</td>\n",
       "      <td>B-5</td>\n",
       "      <td>24160 L221</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1st</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Miss Helen Loraine</td>\n",
       "      <td>2.0000</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Montreal, PQ / Chesterville, ON</td>\n",
       "      <td>C26</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1st</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Mr Hudson Joshua Creighton</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Montreal, PQ / Chesterville, ON</td>\n",
       "      <td>C26</td>\n",
       "      <td>NaN</td>\n",
       "      <td>(135)</td>\n",
       "      <td>male</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1st</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)</td>\n",
       "      <td>25.0000</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Montreal, PQ / Chesterville, ON</td>\n",
       "      <td>C26</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>1st</td>\n",
       "      <td>1</td>\n",
       "      <td>Allison, Master Hudson Trevor</td>\n",
       "      <td>0.9167</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Montreal, PQ / Chesterville, ON</td>\n",
       "      <td>C22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11</td>\n",
       "      <td>male</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   row.names pclass  survived  \\\n",
       "0          1    1st         1   \n",
       "1          2    1st         0   \n",
       "2          3    1st         0   \n",
       "3          4    1st         0   \n",
       "4          5    1st         1   \n",
       "\n",
       "                                              name      age     embarked  \\\n",
       "0                     Allen, Miss Elisabeth Walton  29.0000  Southampton   \n",
       "1                      Allison, Miss Helen Loraine   2.0000  Southampton   \n",
       "2              Allison, Mr Hudson Joshua Creighton  30.0000  Southampton   \n",
       "3  Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)  25.0000  Southampton   \n",
       "4                    Allison, Master Hudson Trevor   0.9167  Southampton   \n",
       "\n",
       "                         home.dest room      ticket   boat     sex  \n",
       "0                     St Louis, MO  B-5  24160 L221      2  female  \n",
       "1  Montreal, PQ / Chesterville, ON  C26         NaN    NaN  female  \n",
       "2  Montreal, PQ / Chesterville, ON  C26         NaN  (135)    male  \n",
       "3  Montreal, PQ / Chesterville, ON  C26         NaN    NaN  female  \n",
       "4  Montreal, PQ / Chesterville, ON  C22         NaN     11    male  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2. 数据处理\n",
    "# 2.1 缺失值处理\n",
    "data.age.fillna(value=data.age.mean(), inplace=True)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.2 确定目标值和特征值\n",
    "x = data[['pclass', 'age', 'sex']]\n",
    "y = data['survived']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.3 数据划分\n",
    "x_train, x_test, y_train, y_test = train_test_split(x, y ,random_state=2, test_size=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. 特征工程-特征提取\n",
    "# 3.1 创建字典特征提取转换器\n",
    "transfer = DictVectorizer()\n",
    "\n",
    "# 3.2 将需要标准化的数据类型转换为字典类型\n",
    "x_train = x_train.to_dict(orient='records')\n",
    "x_test = x_test.to_dict(orient='records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3.3 标准化\n",
    "x_train = transfer.fit_transform(x_train)\n",
    "x_test = transfer.fit_transform(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier()"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 4. 机器学习-决策树算法\n",
    "# # 4.1.1 创建ID3决策树估计器\n",
    "# estimator = DecisionTreeClassifier(criterion='entropy')\n",
    "# # 4.2.2 创建CART决策树估计器\n",
    "# estimator = DecisionTreeClassifier()\n",
    "# # 4.2.3 随机森林估计器,因为要5折交叉验证，所以总共训练了6种数量的决策树*5种深度*5折=150次训练\n",
    "# # 交叉验证-网格搜索只能提高模型预测的稳定性，不能提高模型的准确率\n",
    "# estimator = RandomForestClassifier()\n",
    "# param_grid = {\"n_estimators\": [120,200,300,500,800,1200], \"max_depth\": [5, 8, 15, 25, 30]}\n",
    "# estimator = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5)\n",
    "# 4.2.4 不进行交叉训练网格搜索\n",
    "estimator = RandomForestClassifier()\n",
    "\n",
    "# 4.2 训练模型\n",
    "estimator.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "预测目标值和测试目标值对比:\n",
      " 811      True\n",
      "681      True\n",
      "757      True\n",
      "1223     True\n",
      "846      True\n",
      "        ...  \n",
      "129     False\n",
      "240     False\n",
      "665     False\n",
      "1271     True\n",
      "669      True\n",
      "Name: survived, Length: 263, dtype: bool\n",
      "模型准确率为:\n",
      " 0.8060836501901141\n"
     ]
    },
    {
     "ename": "AttributeError",
     "evalue": "'RandomForestClassifier' object has no attribute 'best_estimator_'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-23-86a2f63f958b>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      9\u001b[0m \u001b[1;31m# 5.3 最好的模型\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"最好的模型是:\\n\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mAttributeError\u001b[0m: 'RandomForestClassifier' object has no attribute 'best_estimator_'"
     ]
    }
   ],
   "source": [
    "# 5. 模型评估\n",
    "# 5.1 预测目标值和测试目标值对比\n",
    "y_pre = estimator.predict(x_test)\n",
    "print(\"预测目标值和测试目标值对比:\\n\", y_pre == y_test)\n",
    "\n",
    "# 5.2 模型准确率\n",
    "print(\"模型准确率为:\\n\", estimator.score(x_test, y_test))\n",
    "\n",
    "# # 5.3 最好的模型（进行交叉验证-网格搜索时才用）\n",
    "# print(\"最好的模型是:\\n\", estimator.best_estimator_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
